library("tidyverse")
library("knitr")
library("plotly")

Import data

eigenvectors <- readRDS("../results/2017-08-29_keratinocyte_pca_eigenvectors/eigenvectors_clusterTG.RDS")
# expression.data <- readRDS("../../data/25-08-2017_APC_E7_SCRNASEQ/keratinocyte_exp_matrix.RDS")
expression.data <- readRDS("../data/25-08-2017_APC_E7_SCRNASEQ/keratinocyte_exp_matrix.RDS")
rownames(expression.data) <- gsub("_.*", '', rownames(expression.data))

# keep.e7 <- which(rownames(expression.data) == "E7")
# expression.data <- expression.data[c(keep.e7, sample(x = seq_len(nrow(expression.data)), size = 299)), sample(x = seq_len(ncol(expression.data)), size = 200)]

Get summary statistics

expression.data %>% 
  colnames() %>% 
  strsplit("_") %>% 
  lapply("[", 1) %>% 
  unlist() -> samples

Number of cells per sample

samples %>% 
  table() %>% 
  as.data.frame() %>% 
  setNames(c("Samples", "Frequency"))
##   Samples Frequency
## 1       1      3230
## 2       2      2632
## 3       3      1771
## 4       4       907
groups <- factor(ifelse(samples == 1 | samples == 3, "WT", "TG"))

Number of cells per cluster (“Wild type” vs. “Transgenic”)

groups %>% 
  table() %>% 
  as.data.frame() %>% 
  setNames(c("Cluster", "Frequency")) %>% 
  kable()
Cluster Frequency
TG 3539
WT 5001
# Get row for E7 gene expression
e7.idx <- which(rownames(expression.data) == "E7")

# Create data frame with cell id, E7 expression, group and sample info
viral.genes <- data.frame(cell.id = names(expression.data[e7.idx,]),
                          e7 = expression.data[e7.idx,],
                          groups = factor(groups),
                          samples = factor(samples))

# Assign if a cell is infected or not depending on gene expression threshold of E7 gene
viral.genes %>% 
  mutate(status = as.factor(if_else((samples == "2" | samples == "4") & e7 > 0, 
                                    "Infected", 
                                    "Non-infected")), 
         cell.id = factor(cell.id)) -> infected.cells

Number of cells depending on infection status

# Get number of cells infected 
infected.cells %>% 
  group_by(status) %>% 
  summarise(n = n()) %>% 
  kable()
status n
Infected 726
Non-infected 7814

Number of cells depending on infection status (stratified by sample)

# Get total number of cells infected stratified by sample inf
infected.cells %>% 
  group_by(status, samples) %>% 
  summarise(n = n()) %>% 
  arrange(samples) %>% 
  kable()
status samples n
Non-infected 1 3230
Infected 2 474
Non-infected 2 2158
Non-infected 3 1771
Infected 4 252
Non-infected 4 655
transgenic.exp.data <- expression.data[,colnames(expression.data) %in% infected.cells$cell.id]


e7.idx <- which(rownames(transgenic.exp.data) == "E7")

transgenic.exp.data <- transgenic.exp.data[-e7.idx,]

# Make sure all cells in new gene expression matrix are ordered according to infected.cells data frame
# all(colnames(transgenic.exp.data) == infected.cells$cell.id)
keratinocytes <- as.tibble(cbind(eigenvectors, infected.cells))
keratinocytes <- keratinocytes %>% 
  mutate(batch = if_else(samples == 1 | samples == 2, "Batch1", "Batch2"))

PCA plots

Sample origin

plot_ly(keratinocytes, x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~samples, 
        colors = "Set3",
        marker = list(size = 2),
        size = 10, replace = TRUE) 

Cluster WT vs. TG

plot_ly(keratinocytes, x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~groups, 
        colors = "Set1",
        marker = list(size = 2)) 

Detected viral transcripts

plot_ly(keratinocytes, x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~status, 
        colors = "Set2",
        marker = list(size = 2)) 

Batch info

plot_ly(keratinocytes, x = ~PC1, y = ~PC2, z = ~PC3, 
        color = ~batch, 
        colors = "Accent",
        marker = list(size = 2))